Load required libraries

library(corrplot)
library(stats)
library(dendextend)
library(car)  # For ncvTest

Setup and Data Loading

# Load Statistical Analysis Functions 
source("statistical_analysis_functions.R")  # Load test functions

# Quietly move up one directory level from R/ to the project root
suppressMessages(setwd(".."))

# Load data files silently
suppressMessages(suppressWarnings(
  tryCatch({
    # Discovery cohort
    HER2 <- read.csv("data/discovery_cohort/37POI-Her2.csv")
    TN <- read.csv("data/discovery_cohort/37POI-TN.csv")
    ERPR <- read.csv("data/discovery_cohort/37POI-ERPR.csv")
    
    # Validation cohort
    Luminal_A <- read.csv("data/validation_cohort/45POI-Luminal_A.csv")
    Luminal_B <- read.csv("data/validation_cohort/45POI-Luminal_B.csv")
    Basal <- read.csv("data/validation_cohort/45POI-Basal-like.csv")
    HER2E <- read.csv("data/validation_cohort/45POI-Her2.csv")
  }, error = function(e) {
    stop("Error loading data files: ", e$message)
  })
))

Statistical Assumption Testing

# Test assumptions for each cohort
cat("\n=== DISCOVERY COHORT ===\n")
## 
## === DISCOVERY COHORT ===
her2_summary <- summarise_assumptions(HER2, "HER2")
## 
## Analysing HER2 
## ----------------------------------------
## Normality: 73.0% passed
## Linearity: 11 strong correlations
## Homoscedasticity: 90.5% passed
tn_summary <- summarise_assumptions(TN, "Triple Negative")
## 
## Analysing Triple Negative 
## ----------------------------------------
## Normality: 70.3% passed
## Linearity: 37 strong correlations
## Homoscedasticity: 93.7% passed
erpr_summary <- summarise_assumptions(ERPR, "ERPR")
## 
## Analysing ERPR 
## ----------------------------------------
## Normality: 70.3% passed
## Linearity: 18 strong correlations
## Homoscedasticity: 93.5% passed
cat("\n=== VALIDATION COHORT ===\n")
## 
## === VALIDATION COHORT ===
luma_summary <- summarise_assumptions(Luminal_A, "Luminal A")
## 
## Analysing Luminal A 
## ----------------------------------------
## Normality: 82.6% passed
## Linearity: 20 strong correlations
## Homoscedasticity: 91.8% passed
lumb_summary <- summarise_assumptions(Luminal_B, "Luminal B")
## 
## Analysing Luminal B 
## ----------------------------------------
## Normality: 73.9% passed
## Linearity: 30 strong correlations
## Homoscedasticity: 90.1% passed
basal_summary <- summarise_assumptions(Basal, "Basal-like")
## 
## Analysing Basal-like 
## ----------------------------------------
## Normality: 80.4% passed
## Linearity: 45 strong correlations
## Homoscedasticity: 87.4% passed
her2e_summary <- summarise_assumptions(HER2E, "HER2-enriched")
## 
## Analysing HER2-enriched 
## ----------------------------------------
## Normality: 82.6% passed
## Linearity: 152 strong correlations
## Homoscedasticity: 98.6% passed

Generate Pearson Correlation Visualisations with CPNE3 Correlations

# Calculate correlations
cor_HER2 <- cor(HER2[,-1], method="pearson")
cor_TN <- cor(TN[,-1], method="pearson")
cor_ERPR <- cor(ERPR[,-1], method="pearson")

# Discovery cohort plots and CPNE3 correlations
par(mfrow=c(2,2), oma=c(2,2,2,2))  # Set outer margins
plot_correlation_with_clusters(cor_HER2, "HER2 Subtype Correlations")
plot_correlation_with_clusters(cor_TN, "Triple Negative Subtype Correlations")
plot_correlation_with_clusters(cor_ERPR, "ER/PR Subtype Correlations")

# Print CPNE3 correlation analysis header
cat("\nCPNE3 CORRELATION ANALYSIS - DISCOVERY COHORTS\n")
## 
## CPNE3 CORRELATION ANALYSIS - DISCOVERY COHORTS
cat("===================================================\n")
## ===================================================
# Get CPNE3 correlations for each cohort
her2_cpne3 <- get_cpne3_correlations(HER2, "HER2")
## 
## Significant CPNE3 correlations in HER2 cohort:
## ----------------------------------------
## Protein  Correlation P-Value
## SLC25A3  0.753   1.182e-03
## CACYBP   0.718   2.573e-03
## PRKCSH   0.643   9.673e-03
## RPL19    0.640   1.016e-02
## BOLA2    0.626   1.249e-02
## CANX 0.551   3.315e-02
## LRPPRC   0.540   3.762e-02
## HSPE1    0.526   4.402e-02
## ATP5B    0.521   4.626e-02
## EIF5A    0.516   4.877e-02
tn_cpne3 <- get_cpne3_correlations(TN, "Triple Negative")
## 
## Significant CPNE3 correlations in Triple Negative cohort:
## ----------------------------------------
## Protein  Correlation P-Value
## RHOC 0.806   2.763e-03
## CALML3   0.764   6.231e-03
## PRDX3    -0.705  1.550e-02
## CS   0.663   2.622e-02
erpr_cpne3 <- get_cpne3_correlations(ERPR, "ERPR")
## 
## Significant CPNE3 correlations in ERPR cohort:
## ----------------------------------------
## Protein  Correlation P-Value
## HIST1H4A -0.860  8.112e-05
## EIF5A    -0.727  3.191e-03
## RPL19    -0.579  2.991e-02

# Calculate validation cohort correlations
cor_LumA <- cor(Luminal_A[,-1], method="pearson")
cor_LumB <- cor(Luminal_B[,-1], method="pearson")
cor_Basal <- cor(Basal[,-1], method="pearson")
cor_HER2E <- cor(HER2E[,-1], method="pearson")

# First create all plots
par(mfrow=c(2,2), oma=c(2,2,2,2))  # Set outer margins
plot_correlation_with_clusters(cor_LumA, "Luminal A Correlations")
plot_correlation_with_clusters(cor_LumB, "Luminal B Correlations")
plot_correlation_with_clusters(cor_Basal, "Basal-like Correlations")
plot_correlation_with_clusters(cor_HER2E, "HER2-enriched Correlations")

# Print CPNE3 correlation analysis header
cat("\nCPNE3 CORRELATION ANALYSIS - VALIDATION COHORTS\n")
## 
## CPNE3 CORRELATION ANALYSIS - VALIDATION COHORTS
cat("===================================================\n")
## ===================================================
# Get and display CPNE3 correlations for validation cohorts
luma_cpne3 <- get_cpne3_correlations(Luminal_A, "Luminal A")
## 
## Significant CPNE3 correlations in Luminal A cohort:
## ----------------------------------------
## Protein  Correlation P-Value
## PPP6R2   0.486   1.874e-02
## KARS -0.415  4.921e-02
lumb_cpne3 <- get_cpne3_correlations(Luminal_B, "Luminal B")
## 
## Significant CPNE3 correlations in Luminal B cohort:
## ----------------------------------------
## Protein  Correlation P-Value
## LRPPRC   0.505   5.213e-03
## EIF5A    0.425   2.165e-02
## VDAC2    0.417   2.432e-02
## RANBP1   0.393   3.481e-02
## PRDX3    0.371   4.754e-02
basal_cpne3 <- get_cpne3_correlations(Basal, "Basal-like")
## 
## Significant CPNE3 correlations in Basal-like cohort:
## ----------------------------------------
## Protein  Correlation P-Value
her2e_cpne3 <- get_cpne3_correlations(HER2E, "HER2-enriched")
## 
## Significant CPNE3 correlations in HER2-enriched cohort:
## ----------------------------------------
## Protein  Correlation P-Value
## HIST1H4A -0.849  1.557e-02
## PYGM 0.849   1.566e-02
## EIF5A    0.802   2.999e-02

Generate Spearman Correlation Visualisations with CPNE3 Correlations

# Calculate Spearman correlations
spear_HER2 <- cor(HER2[,-1], method="spearman")
spear_TN <- cor(TN[,-1], method="spearman")
spear_ERPR <- cor(ERPR[,-1], method="spearman")

# Discovery cohort Spearman plots
par(mfrow=c(2,2), oma=c(2,2,2,2))  # Set outer margins
plot_correlation_with_clusters(spear_HER2, "HER2 Subtype Spearman Correlations")
plot_correlation_with_clusters(spear_TN, "Triple Negative Subtype Spearman Correlations")
plot_correlation_with_clusters(spear_ERPR, "ER/PR Subtype Spearman Correlations")

# Print CPNE3 correlation analysis
cat("\nCPNE3 SPEARMAN CORRELATION ANALYSIS - DISCOVERY COHORTS\n")
## 
## CPNE3 SPEARMAN CORRELATION ANALYSIS - DISCOVERY COHORTS
cat("===================================================\n")
## ===================================================
cat(get_spearman_cpne3_correlations(HER2, "HER2"))
## 
## Significant CPNE3 Spearman correlations in HER2 cohort:
## ----------------------------------------
## Protein  Correlation P-Value
## BOLA2    0.696   3.921e-03
## CANX 0.579   2.385e-02
## HSPE1    0.571   2.606e-02
## PRKCSH   0.559   3.013e-02
## LRPPRC   0.552   3.278e-02
## KARS 0.552   3.289e-02
## RPL19    0.539   3.802e-02
cat(get_spearman_cpne3_correlations(TN, "Triple Negative"))
## 
## Significant CPNE3 Spearman correlations in Triple Negative cohort:
## ----------------------------------------
## Protein  Correlation P-Value
## PRDX3    -0.864  6.117e-04
## APRT -0.729  1.093e-02
## RPL19    0.655   2.886e-02
## ATP5B    -0.630  3.770e-02
## ATP5D    -0.618  4.265e-02
cat(get_spearman_cpne3_correlations(ERPR, "ERPR"))
## 
## Significant CPNE3 Spearman correlations in ERPR cohort:
## ----------------------------------------
## Protein  Correlation P-Value
## HIST1H4A -0.836  1.948e-04
## EIF5A    -0.683  7.118e-03

# Calculate validation cohort Spearman correlations
spear_LumA <- cor(Luminal_A[,-1], method="spearman")
spear_LumB <- cor(Luminal_B[,-1], method="spearman")
spear_Basal <- cor(Basal[,-1], method="spearman")
spear_HER2E <- cor(HER2E[,-1], method="spearman")

# Validation cohort Spearman plots
par(mfrow=c(2,2), oma=c(2,2,2,2))  # Set outer margins
plot_correlation_with_clusters(spear_LumA, "Luminal A Spearman Correlations")
plot_correlation_with_clusters(spear_LumB, "Luminal B Spearman Correlations")
plot_correlation_with_clusters(spear_Basal, "Basal-like Spearman Correlations")
plot_correlation_with_clusters(spear_HER2E, "HER2-enriched Spearman Correlations")

# Print CPNE3 correlation analysis
cat("\nCPNE3 SPEARMAN CORRELATION ANALYSIS - VALIDATION COHORTS\n")
## 
## CPNE3 SPEARMAN CORRELATION ANALYSIS - VALIDATION COHORTS
cat("===================================================\n")
## ===================================================
cat(get_spearman_cpne3_correlations(Luminal_A, "Luminal A"))
## 
## Significant CPNE3 Spearman correlations in Luminal A cohort:
## ----------------------------------------
## Protein  Correlation P-Value
## KARS -0.456  2.893e-02
## PPP6R2   0.438   3.670e-02
## HIST1H1E -0.420  4.603e-02
## HIST1H1A -0.419  4.660e-02
cat(get_spearman_cpne3_correlations(Luminal_B, "Luminal B"))
## 
## Significant CPNE3 Spearman correlations in Luminal B cohort:
## ----------------------------------------
## Protein  Correlation P-Value
## LRPPRC   0.509   4.818e-03
## PRDX3    0.494   6.507e-03
## CS   0.459   1.234e-02
## VDAC2    0.447   1.498e-02
## RANBP1   0.393   3.489e-02
## EIF5A    0.380   4.185e-02
cat(get_spearman_cpne3_correlations(Basal, "Basal-like"))
## 
## Significant CPNE3 Spearman correlations in Basal-like cohort:
## ----------------------------------------
## Protein  Correlation P-Value
cat(get_spearman_cpne3_correlations(HER2E, "HER2-enriched"))
## 
## Significant CPNE3 Spearman correlations in HER2-enriched cohort:
## ----------------------------------------
## Protein  Correlation P-Value
## EIF5A    0.857   1.370e-02
## APRT 0.821   2.345e-02
## PYGM 0.821   2.345e-02
## PPP6R2   -0.786  3.624e-02

Save Correlation Results and Plots to Output Folder

# Create organized output directories in project root
dir.create("../outputs/correlation_plots", recursive = TRUE, showWarnings = FALSE)
dir.create("../outputs/results", recursive = TRUE, showWarnings = FALSE)

# Discovery cohort plots and correlations
pdf("../outputs/correlation_plots/discovery_cohort_correlations.pdf", width=12, height=12)
par(mfrow=c(2,2), oma=c(2,2,2,2))  # Set outer margins
suppressMessages(suppressWarnings({
  plot_correlation_with_clusters(cor_HER2, "HER2 Subtype Correlations")
  plot_correlation_with_clusters(cor_TN, "Triple Negative Subtype Correlations")
  plot_correlation_with_clusters(cor_ERPR, "ER/PR Subtype Correlations")
}))
invisible(dev.off())

# Save validation cohort plots and correlations
pdf("../outputs/correlation_plots/validation_cohort_correlations.pdf", width=12, height=12)
par(mfrow=c(2,2), oma=c(2,2,2,2))
suppressMessages(suppressWarnings({
  plot_correlation_with_clusters(cor_LumA, "Luminal A Correlations")
  plot_correlation_with_clusters(cor_LumB, "Luminal B Correlations")
  plot_correlation_with_clusters(cor_Basal, "Basal-like Correlations")
  plot_correlation_with_clusters(cor_HER2E, "HER2-enriched Correlations")
}))
invisible(dev.off())

# Save discovery cohorts correlation analysis
discovery_results <- suppressMessages(suppressWarnings(
  paste0(
    "CPNE3 CORRELATION ANALYSIS - DISCOVERY COHORTS\n",
    "=============================================\n",
    capture.output(get_cpne3_correlations(HER2, "HER2")),
    capture.output(get_cpne3_correlations(TN, "Triple Negative")),
    capture.output(get_cpne3_correlations(ERPR, "ERPR"))
  )
))
suppressMessages(writeLines(discovery_results, "../outputs/results/discovery_cohort_correlations.txt"))

# Save validation cohorts correlation analysis
validation_results <- suppressMessages(suppressWarnings(
  paste0(
    "CPNE3 CORRELATION ANALYSIS - VALIDATION COHORTS\n",
    "=============================================\n",
    capture.output(get_cpne3_correlations(Luminal_A, "Luminal A")),
    capture.output(get_cpne3_correlations(Luminal_B, "Luminal B")),
    capture.output(get_cpne3_correlations(Basal, "Basal-like")),
    capture.output(get_cpne3_correlations(HER2E, "HER2-enriched"))
  )
))
suppressMessages(writeLines(validation_results, "../outputs/results/validation_cohort_correlations.txt"))

# Save discovery cohort Spearman plots
pdf("../outputs/correlation_plots/discovery_cohort_spearman_correlations.pdf", width=12, height=12)
par(mfrow=c(2,2), oma=c(2,2,2,2))
suppressMessages(suppressWarnings({
  plot_correlation_with_clusters(spear_HER2, "HER2 Subtype Spearman Correlations")
  plot_correlation_with_clusters(spear_TN, "Triple Negative Subtype Spearman Correlations")
  plot_correlation_with_clusters(spear_ERPR, "ER/PR Subtype Spearman Correlations")
}))
invisible(dev.off())

# Save discovery cohort Spearman correlation analysis
discovery_results <- suppressMessages(suppressWarnings(
  paste0(
    "CPNE3 SPEARMAN CORRELATION ANALYSIS - DISCOVERY COHORTS\n",
    "===================================================\n",
    capture.output(get_spearman_cpne3_correlations(HER2, "HER2")),
    capture.output(get_spearman_cpne3_correlations(TN, "Triple Negative")),
    capture.output(get_spearman_cpne3_correlations(ERPR, "ERPR"))
  )
))
suppressMessages(writeLines(discovery_results, "../outputs/results/discovery_cohort_spearman_correlations.txt"))

# Save validation cohort Spearman plots
pdf("../outputs/correlation_plots/validation_cohort_spearman_correlations.pdf", width=12, height=12)
par(mfrow=c(2,2), oma=c(2,2,2,2))
suppressMessages(suppressWarnings({
  plot_correlation_with_clusters(spear_LumA, "Luminal A Spearman Correlations")
  plot_correlation_with_clusters(spear_LumB, "Luminal B Spearman Correlations")
  plot_correlation_with_clusters(spear_Basal, "Basal-like Spearman Correlations")
  plot_correlation_with_clusters(spear_HER2E, "HER2-enriched Spearman Correlations")
}))
invisible(dev.off())

# Save validation cohort Spearman correlation analysis
validation_results <- suppressMessages(suppressWarnings(
  paste0(
    "CPNE3 SPEARMAN CORRELATION ANALYSIS - VALIDATION COHORTS\n",
    "===================================================\n",
    capture.output(get_spearman_cpne3_correlations(Luminal_A, "Luminal A")),
    capture.output(get_spearman_cpne3_correlations(Luminal_B, "Luminal B")),
    capture.output(get_spearman_cpne3_correlations(Basal, "Basal-like")),
    capture.output(get_spearman_cpne3_correlations(HER2E, "HER2-enriched"))
  )
))
suppressMessages(writeLines(validation_results, "../outputs/results/validation_cohort_spearman_correlations.txt"))